In [1]:
    
# preamble
import networkx as nx
import cPickle as pickle
import os
# load pickled network file
pickle_name = 'Data_world8_network.pckl'
pickle_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\Pickled Data'
reddit_network = pickle.load( open(pickle_dir + os.sep + pickle_name, "rb") )
print "loaded reddit network from ", pickle_dir + os.sep + pickle_name
    
    
In [2]:
    
import matplotlib.pyplot as plt
%matplotlib inline
    
In [3]:
    
#create graph
reddit_graph = nx.Graph(name='Reddit Graph')
# add edges (nodes added automatically)
for userA in reddit_network.keys():
    for userB in reddit_network[userA].keys():
        # add only if enough messages between the two users
        if len(reddit_network[userA][userB])>=2 and (userA != userB):
            reddit_graph.add_edge(userA, userB)
#             print reddit_graph[userA][userB]['polarity']
# save largest connected subgraph
reddit_graph = sorted(nx.connected_component_subgraphs(reddit_graph), key = len, reverse=True)[0]
    
In [4]:
    
# Stats
# print 'degrees:',nx.degree_histogram(reddit_graph)
print 'info: ', nx.info(reddit_graph)
print 'density: ', nx.density(reddit_graph), ' (0-1 scale, 0 for empty graph, 1 for complete graph)'
# triangles
triangles_dict = nx.triangles(reddit_graph)
nodes_in_triangles = [node for node in reddit_graph.nodes() if triangles_dict[node]>=1]
print '# of triangles in graph:', sum(nx.triangles(reddit_graph).values())/3
# plot the degree distribution
print "Degree Distribution"
hist=nx.degree_histogram(reddit_graph)
plt.bar(range(len(hist)), hist, align='center')
plt.xlim(0,30)
plt.show()
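
The nodes_in_triangles list computed above is never reported; a short follow-up (a sketch, not part of the original run) prints it alongside the average clustering coefficient:

# how many nodes sit in at least one triangle, and how clustered the graph is overall
print '# of nodes in at least one triangle:', len(nodes_in_triangles)
print 'average clustering coefficient:', nx.average_clustering(reddit_graph)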
    
    
    
In [5]:
    
# extract list of triangles
triangle_list = []
done = set()
for n in reddit_graph:
    done.add(n)         # all triangles through n are found in this pass
    nbrdone = set()     # second vertices already tried for this n
    nbrs = set(reddit_graph[n])
    for nbr in nbrs:
        if nbr in done:    # triangle already found when nbr was the first vertex
            continue
        nbrdone.add(nbr)   # avoids counting (n, nbr, both) and (n, both, nbr) twice
        for both in nbrs.intersection(reddit_graph[nbr]):
            if both in done or both in nbrdone:    # already counted
                continue
            triangle_list.append( (n, nbr, both) )
print '# of triangles found:', len(triangle_list)
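
The enumeration above should agree with the networkx triangle count from the stats cell; a quick consistency check (a sketch, not in the original notebook):

# nx.triangles counts each triangle once per vertex, hence the division by 3
assert len(triangle_list) == sum(nx.triangles(reddit_graph).values()) // 3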
    
    
In [6]:
    
#reconstruct graph from triangles (with only edges in those triangles)
from itertools import combinations
triangles_reddit_graph = nx.Graph(name='Triangles Reddit Graph')
for triangle in triangle_list:
    for n1, n2 in combinations(triangle, 2):
        # add edge (adding an existing edge again has no effect in a simple Graph)
        triangles_reddit_graph.add_edge(n1, n2)
    
In [7]:
    
# Stats
# print 'degrees:',nx.degree_histogram(reddit_graph)
print 'info: ', nx.info(triangles_reddit_graph)
print 'density: ', nx.density(triangles_reddit_graph), ' (0-1 scale, 0 for empty graph, 1 for complete graph)'
# triangles
triangles_dict = nx.triangles(triangles_reddit_graph)
nodes_in_triangles = [node for node in triangles_reddit_graph.nodes() if triangles_dict[node]>=1]
print '# of triangles in graph:', sum(nx.triangles(triangles_reddit_graph).values())/3
# plot the degree distribution
print "Degree Distribution"
hist=nx.degree_histogram(triangles_reddit_graph)
plt.bar(range(len(hist)), hist, align='center')
plt.xlim(0,30)
plt.show()
    
    
    
In [8]:
    
# distribution of types of triangles in a random network
# right half of table 4 in paper
    
In [9]:
    
import random
# randomize edge signs: -1 for negative, +1 for positive
for edge in triangles_reddit_graph.edges():
    triangles_reddit_graph.edge[edge[0]][edge[1]]['polarity'] = 2*random.randint(0,1)-1
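
Under independent random signs with probability 1/2 each, the four triangle types +++, ++-, +-- and --- are expected with probabilities 1/8, 3/8, 3/8 and 1/8; a short baseline computation (a sketch, not in the original notebook):

# expected counts under uniformly random edge signs
n_tri = len(triangle_list)
expected = {'+++': n_tri/8., '++-': 3*n_tri/8., '+--': 3*n_tri/8., '---': n_tri/8.}
print "expected counts under random signs:", expected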
    
In [10]:
    
# analyse triangle types
from itertools import combinations
# counts of +++, ++-, +--, --- triangles respectively
counts = {'+++':0, '++-':0, '+--':0, '---':0}
for triangle in triangle_list:
    pairs  = combinations(triangle, 2)
    tot = 0
    for pair in pairs:
        tot += triangles_reddit_graph.edge[pair[0]][pair[1]]['polarity']
    
    if tot == 3:
        counts['+++'] += 1
    elif tot == 1:
        counts['++-'] += 1
    elif tot == -1:
        counts['+--'] += 1
    elif tot == -3:
        counts['---'] += 1
    else:
        print "ERROR"
print "counts in a randomized network: ", counts
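
The same counting loop is repeated below for the OpinionFinder-labeled network; a small helper (hypothetical name count_triangle_types, not in the original notebook) captures it once, skipping any triangle whose summed polarity falls outside {+-1, +-3} (which only happens when neutral edges are involved):

from itertools import combinations

def count_triangle_types(graph, triangles):
    # tally +++, ++-, +--, --- triangles by summing the +1/-1 edge polarities
    counts = {'+++': 0, '++-': 0, '+--': 0, '---': 0}
    labels = {3: '+++', 1: '++-', -1: '+--', -3: '---'}
    for tri in triangles:
        tot = sum(graph.edge[a][b]['polarity'] for a, b in combinations(tri, 2))
        if tot in labels:
            counts[labels[tot]] += 1
    return counts

# usage, equivalent to the loop above:
# print count_triangle_types(triangles_reddit_graph, triangle_list)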
    
    
In [11]:
    
# apply opinion finder to each edge
import string
for edge in triangles_reddit_graph.edges():
    # extract text data
    userA, userB = edge[0],edge[1]
    # print userA, userB
    text_list = reddit_network[userA][userB]
    #print text_list
    
    # save text to temp file
    with open("tmp.txt", "w") as text_file:
        for msg in text_list: 
            # remove non-ascii for opinionfinder
            msg_filtered = filter(lambda x: x in string.printable, msg)
#             print msg_filtered
            text_file.write(msg_filtered + '\n')
            
    # analyse polarity
    os.system('java -classpath opinionfinderv2.0\lib\weka.jar;opinionfinderv2.0\lib\stanford-postagger.jar;opinionfinderv2.0\opinionfinder.jar opin.main.RunOpinionFinder tmp.txt -m opinionfinderv2.0\models -l  opinionfinderv2.0\lexicons')
    
    # open polarity results
    with open("tmp.txt_auto_anns\markup.txt", "r") as res_file:
        content = res_file.read()
        positives = content.count("positive")
        negatives = content.count("negative")
        neutrals = content.count("neutral")
        print userA, userB, positives, negatives, neutrals
        
        # assign polarity
        if negatives>positives:
            polarity_val = -1
        elif negatives<positives:
            polarity_val = 1
        else:
            polarity_val = 0
        
        triangles_reddit_graph.edge[userA][userB]['polarity'] = polarity_val
print "DONE"
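
Before re-counting triangle types it helps to see how OpinionFinder labeled the individual edges; a short check (a sketch, using the 'polarity' attribute assigned above):

from collections import Counter
# distribution of edge polarities: +1 positive, -1 negative, 0 tied/neutral
edge_polarities = Counter(d['polarity'] for _, _, d in triangles_reddit_graph.edges(data=True))
print "edge polarity distribution:", edge_polarities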
    
    
In [12]:
    
# analyse triangle types
# counts of +++, ++-, +--, --- triangles respectively
counts = {'+++':0, '++-':0, '+--':0, '---':0}
for triangle in triangle_list:
    pairs  = combinations(triangle, 2)
    tot = 0
    for pair in pairs:
        tot += triangles_reddit_graph.edge[pair[0]][pair[1]]['polarity']
    
    if tot == 3:
        counts['+++'] += 1
    elif tot == 1:
        counts['++-'] += 1
    elif tot == -1:
        counts['+--'] += 1
    elif tot == -3:
        counts['---'] += 1
    else:
        # even totals occur only when a triangle contains a neutral (0) edge; skip them
        pass
print "counts in a labeled network: ", counts
    
    
In [38]:
    
# Subgroup detection approximation algorithm
# parameters from the paper
alpha = 0.5
max_cluster_num = 5
#network to work on
work_graph = triangles_reddit_graph.copy()
best_cluster_graph = work_graph.copy()
best_price = float("inf")
# for each number of clusters t >= 2
for t in range(2, max_cluster_num+1):
    
    # partition network into t clusters randomly
    # (assign random value to node)
    for node in work_graph.nodes():
        work_graph.node[node]['cluster'] = random.randint(1,t)
         
    # compute price of partition
    cur_price = 0
    for edge in work_graph.edges():
        userA, userB, pol = edge[0], edge[1], work_graph.edge[edge[0]][edge[1]]['polarity']
        
        if pol == -1 and  (work_graph.node[userA]['cluster'] == work_graph.node[userB]['cluster']):
            # same group, disagreement
            cur_price += (1-alpha)
        elif pol == 1 and  (work_graph.node[userA]['cluster'] != work_graph.node[userB]['cluster']):
            # dif groups, agreement
            cur_price += alpha
        else:
            # do not increase price
            pass
    print "clusters: ", t, "initial price", cur_price
    
    if cur_price < best_price:
        best_cluster_graph = work_graph.copy()
        best_price = cur_price
        
    # look at all neighbor clusters
    copy_graph = work_graph.copy()
    
    # keep searching while some neighboring partition improves the price
    neighbor_better = True
    while neighbor_better:
        
        # reset; set to True again if an improving neighbor is found
        neighbor_better = False
        # -1 try each node as switch
        for switch_node in copy_graph.nodes():
            # save cluster value
            save_val = copy_graph.node[switch_node]['cluster']
            # try each alternative cluster value in 1..t
            for cluster_val in [x for x in range(1, t+1) if x != save_val]:
                # assign the candidate value to this node
                copy_graph.node[switch_node]['cluster'] = cluster_val
                #compute new price
                neigh_price = 0
                for edge in copy_graph.edges():
                    userA, userB, pol = edge[0], edge[1], copy_graph.edge[edge[0]][edge[1]]['polarity']
                    if pol == -1 and (copy_graph.node[userA]['cluster'] == copy_graph.node[userB]['cluster']):
                        # same group, disagreement
                        neigh_price += (1-alpha)
                    elif pol == 1 and (copy_graph.node[userA]['cluster'] != copy_graph.node[userB]['cluster']):
                        # dif groups, agreement
                        neigh_price += alpha
                    else:
                        # do not increase price
                        pass
                                    
                # assign new values if better
                if neigh_price < cur_price:
                    cur_price = neigh_price
                    best_switch = [switch_node, cluster_val]
                    neighbor_better = True
                if cur_price < best_price:
                    best_cluster_graph = copy_graph.copy()
                    best_price = cur_price
                # reset value
                copy_graph.node[switch_node]['cluster'] = save_val
        # -2 try each edge as switch
        for switch_edge in copy_graph.edges():
            # switch values
            copy_graph.node[switch_edge[0]]['cluster'],copy_graph.node[switch_edge[1]]['cluster'] = copy_graph.node[switch_edge[1]]['cluster'],copy_graph.node[switch_edge[0]]['cluster']
            #compute new price
            neigh_price = 0
            for edge in copy_graph.edges():
                userA, userB, pol = edge[0], edge[1], copy_graph.edge[edge[0]][edge[1]]['polarity']
                if pol == -1 and (copy_graph.node[userA]['cluster'] == copy_graph.node[userB]['cluster']):
                    # same group, disagreement
                    neigh_price += (1-alpha)
                elif pol == 1 and (copy_graph.node[userA]['cluster'] != copy_graph.node[userB]['cluster']):
                    # dif groups, agreement
                    neigh_price += alpha
                else:
                    # do not increase price
                    pass 
            # assign new price if better
            if neigh_price < cur_price:
                cur_price = neigh_price 
                best_switch = switch_edge
                neighbor_better = True
            if cur_price < best_price:
                best_cluster_graph = copy_graph.copy()
                best_price = cur_price
            # reset values
            copy_graph.node[switch_edge[0]]['cluster'],copy_graph.node[switch_edge[1]]['cluster'] = copy_graph.node[switch_edge[1]]['cluster'],copy_graph.node[switch_edge[0]]['cluster']
        # apply the best switch found in this pass (if any)
        if neighbor_better:
            if type(best_switch) == tuple:
                # best switch is an edge: swap the clusters of its endpoints
                copy_graph.node[best_switch[0]]['cluster'], copy_graph.node[best_switch[1]]['cluster'] = copy_graph.node[best_switch[1]]['cluster'], copy_graph.node[best_switch[0]]['cluster']
            elif type(best_switch) == list:
                # best switch is a node: move it to its best cluster value
                copy_graph.node[best_switch[0]]['cluster'] = best_switch[1]

        print "improved this pass:", neighbor_better
        print "best neighbor price:", cur_price
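
The partition price is recomputed in three places in the cell above with the same rule; as a sketch (hypothetical helper name partition_price, not in the original notebook), it could be factored out:

def partition_price(graph, alpha):
    # alpha for each positive edge that crosses clusters,
    # (1 - alpha) for each negative edge kept inside a cluster;
    # neutral (0) edges contribute nothing
    price = 0.0
    for u, v, d in graph.edges(data=True):
        same_cluster = graph.node[u]['cluster'] == graph.node[v]['cluster']
        if d['polarity'] == -1 and same_cluster:
            price += (1 - alpha)
        elif d['polarity'] == 1 and not same_cluster:
            price += alpha
    return price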
    
    
In [44]:
    
print "best price:", best_price
for node in best_cluster_graph.nodes():
    print node, best_cluster_graph.node[node]['cluster']
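
A cluster-size summary (a sketch, using the 'cluster' attribute assigned by the search above) is more compact than the per-node dump:

from collections import Counter
# number of nodes assigned to each cluster in the best partition found
cluster_sizes = Counter(best_cluster_graph.node[n]['cluster'] for n in best_cluster_graph.nodes())
print "nodes per cluster:", cluster_sizes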
    
    
In [35]:
    
# quick check of the types returned by edges() and nodes(); the clustering cell
# above distinguishes node switches from edge switches via type(best_switch)
for edge in triangles_reddit_graph.edges():
    print type(edge)
    break
    
for node in triangles_reddit_graph.nodes():
    print type(node)
    break
print type(('test', 1)) == tuple
    
    
In [14]:
    
# !cd opinionfinderv2.0\
!java -classpath opinionfinderv2.0\lib\weka.jar;opinionfinderv2.0\lib\stanford-postagger.jar;opinionfinderv2.0\opinionfinder.jar opin.main.RunOpinionFinder opinionfinderv2.0\README.txt -m opinionfinderv2.0\models -l  opinionfinderv2.0\lexicons